library(corrplot)
## corrplot 0.95 loaded
library(lubridate)
##
## Attachement du package : 'lubridate'
## Les objets suivants sont masqués depuis 'package:base':
##
## date, intersect, setdiff, union
library(zoo)
##
## Attachement du package : 'zoo'
## Les objets suivants sont masqués depuis 'package:base':
##
## as.Date, as.Date.numeric
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ purrr 1.2.1 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(survival)
library(asaur)
library(broom)
library(survminer)
## Le chargement a nécessité le package : ggpubr
##
## Attachement du package : 'survminer'
##
## L'objet suivant est masqué depuis 'package:survival':
##
## myeloma
library(ggfortify)
library(gtsummary)
library(ggplot2)
library(GGally)
library(ggsurvfit)
data_orig = read.csv('../../database_1perDay_isWarm8.csv', sep=',', header=T)
cat("--- Dimension of the dataset: ", dim(data_orig), "\n")
## --- Dimension of the dataset: 707 15
cat("--- Number of missing values: ", sum(is.na(data_orig)), "\n") # sum=0 -> no missing values
## --- Number of missing values: 276
head(data_orig)
## id start stop event isWarm latitude longitude temperature windSpeed pressure
## 1 1 0 88 1 0 47.5 6.95 15.6 7 97510
## 2 2 0 79 0 1 47.5 6.95 13.9 5 98170
## 3 2 79 89 1 1 47.5 6.95 19.8 2 99260
## 4 3 0 83 1 0 47.5 6.95 10.0 3 96790
## 5 4 0 83 1 0 47.5 6.95 16.1 2 99110
## 6 5 0 163 0 1 47.5 6.95 26.7 5 98860
## humidity visibility nebulosity cloudHeight moonPhase
## 1 52 40000 75 1250 72
## 2 58 40000 90 800 44
## 3 42 50000 90 1750 17
## 4 53 20000 10 1250 62
## 5 53 7000 10 NA 77
## 6 49 35000 90 1750 4
tbl_summary(data_orig)
| Characteristic | N = 7071 |
|---|---|
| id | 251 (119, 377) |
| start | 0 (0, 157) |
| stop | 157 (116, 185) |
| event | 409 (58%) |
| isWarm | 596 (84%) |
| latitude | |
| Â Â Â Â 47.5 | 79 (11%) |
| Â Â Â Â 47.69 | 64 (9.1%) |
| Â Â Â Â 47.88 | 83 (12%) |
| Â Â Â Â 48.06 | 103 (15%) |
| Â Â Â Â 48.25 | 67 (9.5%) |
| Â Â Â Â 48.44 | 58 (8.2%) |
| Â Â Â Â 48.62 | 82 (12%) |
| Â Â Â Â 48.81 | 98 (14%) |
| Â Â Â Â 49 | 73 (10%) |
| longitude | |
| Â Â Â Â 6.95 | 72 (10%) |
| Â Â Â Â 7.15 | 121 (17%) |
| Â Â Â Â 7.35 | 157 (22%) |
| Â Â Â Â 7.55 | 126 (18%) |
| Â Â Â Â 7.75 | 113 (16%) |
| Â Â Â Â 7.95 | 56 (7.9%) |
| Â Â Â Â 8.15 | 62 (8.8%) |
| temperature | 21 (16, 25) |
| windSpeed | 3.00 (2.00, 4.00) |
| pressure | 98,960 (98,400, 99,770) |
| humidity | 52 (42, 64) |
| visibility | 25,000 (15,050, 40,000) |
| Â Â Â Â Unknown | 2 |
| nebulosity | 90 (60, 90) |
| Â Â Â Â Unknown | 152 |
| cloudHeight | 1,250 (800, 1,750) |
| Â Â Â Â Unknown | 110 |
| moonPhase | 56 (15, 86) |
| Â Â Â Â Unknown | 12 |
| 1 Median (Q1, Q3); n (%) | |
data_clean <- data_orig
# Add a season variable to allow the risk to vary with seasons:
data_clean$season <- factor(quarters(as.Date(data_orig$stop)))
barplot(prop.table(table(data_clean$season)))
The ciconia can fly up to 4 800m high so limit the value of cloudHeight to 3 000
data_clean$cloudHeight[data_clean$cloudHeight>2000] <- 2000
par(mfrow=c(2,2), mai = c(0.6, 0.6, 0.1, 0.1))
hist(data_orig$cloudHeight, breaks=20)
hist(data_clean$cloudHeight, breaks=10)
The ciconia has difficulties flying with winds higher than 3-5m/s
data_clean$windSpeed[data_clean$windSpeed>5] <- 5
par(mfrow=c(2,2), mai = c(0.6, 0.6, 0.1, 0.1))
hist(data_orig$windSpeed)
hist(data_clean$windSpeed)
Remove data with observation time after day 212 (1 august) as they are usually observed much earlier. They are bad measurements ?
ids <- unique(data_clean$id[data_clean$stop>212])
ids
## [1] 31 45 46 51 115 139 144 149 151 155 166 174 191 229 230 246 271 287 289
## [20] 292 320 337 360 368 387 403 407 420 428 429 458
data_clean <- data_clean[! data_clean$id %in% ids,]
par(mfrow=c(2,2), mai = c(0.6, 0.6, 0.1, 0.1))
plot(data_orig$stop, data_orig$temperature)
hist(data_orig$stop, breaks=30)
plot(data_clean$stop, data_clean$temperature)
hist(data_clean$stop, breaks=30)
# Normalize the non-boolean data for all attributes to have mean=0 and std=1.
# with or without normalizing binary data does not affect the result !!
data <- data_clean
data[c(6:15)] <- lapply(data[c(6:15)], function(x) c(scale(x)))
data[is.na(data)] <- 0
summary(data)
## id start stop event
## Min. : 1.0 Min. : -7.00 Min. : -7.0 Min. :0.000
## 1st Qu.:116.0 1st Qu.: 0.00 1st Qu.:114.0 1st Qu.:0.000
## Median :252.0 Median : 0.00 Median :152.0 Median :1.000
## Mean :250.2 Mean : 63.55 Mean :140.6 Mean :0.586
## 3rd Qu.:378.0 3rd Qu.:152.00 3rd Qu.:173.0 3rd Qu.:1.000
## Max. :503.0 Max. :211.00 Max. :212.0 Max. :1.000
## isWarm latitude longitude temperature
## Min. :0.0000 Min. :-1.55631 Min. :-1.5585 Min. :-3.4443
## 1st Qu.:1.0000 1st Qu.:-0.77687 1st Qu.:-0.9811 1st Qu.:-0.6353
## Median :1.0000 Median :-0.01794 Median : 0.1737 Median : 0.0779
## Mean :0.8279 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:1.0000 3rd Qu.: 0.74099 3rd Qu.: 0.7511 3rd Qu.: 0.7620
## Max. :1.0000 Max. : 1.52044 Max. : 1.9059 Max. : 2.3047
## windSpeed pressure humidity visibility
## Min. :-2.1648 Min. :-3.26051 Min. :-2.0011 Min. :-1.8175
## 1st Qu.:-0.7458 1st Qu.:-0.74509 1st Qu.:-0.7689 1st Qu.:-0.8595
## Median :-0.0363 Median :-0.08085 Median :-0.1528 Median :-0.1905
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6732 3rd Qu.: 0.76851 3rd Qu.: 0.5865 3rd Qu.: 0.8131
## Max. : 1.3827 Max. : 2.66324 Max. : 2.8044 Max. : 3.1547
## nebulosity cloudHeight moonPhase season
## Min. :-2.50903 Min. :-2.39169 Min. :-1.4763 Q1: 96
## 1st Qu.:-0.42996 1st Qu.:-0.91927 1st Qu.:-1.0491 Q2:416
## Median : 0.08981 Median :-0.05877 Median : 0.0899 Q3:131
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Q4: 2
## 3rd Qu.: 0.60957 3rd Qu.: 0.89735 3rd Qu.: 0.9442
## Max. : 0.99074 Max. : 1.37541 Max. : 1.3713
head(data)
## id start stop event isWarm latitude longitude temperature windSpeed
## 1 1 0 88 1 0 -1.556309 -1.55854 -0.620721914 1.3827075
## 2 2 0 79 0 1 -1.556309 -1.55854 -0.868148691 1.3827075
## 3 2 79 89 1 1 -1.556309 -1.55854 -0.009432229 -0.7458041
## 4 3 0 83 1 0 -1.556309 -1.55854 -1.435774827 -0.0363002
## 5 4 0 83 1 0 -1.556309 -1.55854 -0.547949332 -0.7458041
## 6 5 0 163 0 1 -1.556309 -1.55854 0.994829396 1.3827075
## pressure humidity visibility nebulosity cloudHeight moonPhase season
## 1 -1.70334930 -0.15282675 0.8130947 0.08980589 -0.05876531 0.5739934 Q1
## 2 -0.98465860 0.21682295 0.8130947 0.60957289 -0.91926988 -0.2233388 Q1
## 3 0.20226999 -0.76890959 1.4821295 0.60957289 0.89735087 -0.9921949 Q1
## 4 -2.48737552 -0.09121847 -0.5249748 -2.16251774 -0.05876531 0.2892319 Q1
## 5 0.03893119 -0.09121847 -1.3947201 -2.16251774 0.00000000 0.7163741 Q1
## 6 -0.23330014 -0.33765160 0.4785773 0.60957289 0.89735087 -1.3623849 Q2
## See linear variation of temperature and time:
plot(data$stop, data$temperature, lwd=2)
plot(data$stop, data$temperature/data$stop)
abline(h=0.05, lty=2)
abline(h=-0.2, lty=2)
# remove outliers:
data <- data[data$temperature/data$stop <= 0.05,]
data <- data[data$temperature/data$stop >= -0.2,]
old.par = par(mar = c(6, 5, 2, 2))
par(mfrow=c(3,2), mai = c(0.6, 0.6, 0.1, 0.1))
plot(data$stop, data$temperature, lwd=2, xlab="Days to event", ylab="Temperature [°C]")
boxplot(data$temperature, ylab="Temperature [°C]")
plot(data$stop, data$windSpeed , lwd=2, xlab="Days to event", ylab="Wind Speed [m/s]")
boxplot(data$windSpeed, ylab="Wind Speed [m/s]")
plot(data$stop, data$pressure , lwd=2, xlab="Days to event", ylab="Pressure [Pa]")
boxplot(data$pressure, ylab="Pressure [Pa]")
par(mfrow=c(3,2), mai = c(0.6, 0.6, 0.1, 0.1))
plot(data$stop, data$humidity , lwd=2, xlab="Days to event", ylab="Humidity [%]")
boxplot(data$humidity, ylab="Humidity [%]")
plot(data$stop, data$visibility , lwd=2, xlab="Days to event", ylab="Visibility [%]")
boxplot(data$visibility, ylab="Visibility [%]")
plot(data$stop, data$nebulosity , lwd=2, xlab="Days to event", ylab="Nebulosiity [%]")
boxplot(data$nebulosity, ylab="Nebulosity [%]")
par(mfrow=c(3,2), mai = c(0.6, 0.6, 0.1, 0.1))
plot(data$stop, data$cloudHeight , lwd=2, xlab="Days to event", ylab="Cloud Height [m]")
boxplot(data$cloudHeight, ylab="Cloud Height [m]")
plot(data$stop, data$moonPhase , lwd=2, xlab="Days to event", ylab="Moon Phase [%]")
boxplot(data$moonPhase, ylab="Moon Phase [%]")
hist(data$isWarm)
hist(data$event)
ggplot(stack(data[, !names(data) %in% c("stop", "start", "id", "event", "season", "isWarm")]), aes(x=ind, y=values)) + geom_boxplot() + coord_flip()
old.par = par(mar = c(6, 5, 2, 2))
par(mfrow=c(3,3), mai = c(0.6, 0.6, 0.1, 0.1))
plot(data$stop, data$temperature, lwd=2, xlab="Days to event", ylab="Temperature [°C]")
plot(data$stop, data$windSpeed , lwd=2, xlab="Days to event", ylab="Wind Speed [m/s]")
plot(data$stop, data$pressure , lwd=2, xlab="Days to event", ylab="Pressure [Pa]")
plot(data$stop, data$humidity , lwd=2, xlab="Days to event", ylab="Humidity [%]")
plot(data$stop, data$visibility , lwd=2, xlab="Days to event", ylab="Visibility [%]")
plot(data$stop, data$nebulosity , lwd=2, xlab="Days to event", ylab="Nebulosiity [%]")
plot(data$stop, data$cloudHeight , lwd=2, xlab="Days to event", ylab="Cloud Height [m]")
plot(data$stop, data$moonPhase , lwd=2, xlab="Days to event", ylab="Moon Phase [%]")
par(mfrow=c(1,1))
corrplot(cor(data[1:15]), tl.col="black", main="Correlation Matrix")
Mnone <- coxph(Surv(start, stop, event)~1 , data=data)
Mtemp <- coxph(Surv(start, stop, event)~temperature, data=data)
Mwarm <- coxph(Surv(start, stop, event)~isWarm , data=data)
Mwind <- coxph(Surv(start, stop, event)~windSpeed , data=data)
Mcloud <- coxph(Surv(start, stop, event)~cloudHeight, data=data)
cat("--- Mtemp ---\n\n")
## --- Mtemp ---
summary(Mtemp)
## Call:
## coxph(formula = Surv(start, stop, event) ~ temperature, data = data)
##
## n= 641, number of events= 376
##
## coef exp(coef) se(coef) z Pr(>|z|)
## temperature -0.90967 0.40266 0.06867 -13.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## temperature 0.4027 2.484 0.352 0.4607
##
## Concordance= 0.751 (se = 0.016 )
## Likelihood ratio test= 179.9 on 1 df, p=<2e-16
## Wald test = 175.5 on 1 df, p=<2e-16
## Score (logrank) test = 182.3 on 1 df, p=<2e-16
# p-value < 0.01 --> reject H0. Difference between Mnone and Mtemp is higly significant -> should keep temperature
cat("--- Mwarm ---\n\n")
## --- Mwarm ---
summary(Mwarm)
## Call:
## coxph(formula = Surv(start, stop, event) ~ isWarm, data = data)
##
## n= 641, number of events= 376
##
## coef exp(coef) se(coef) z Pr(>|z|)
## isWarm -3.2649 0.0382 0.1973 -16.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## isWarm 0.0382 26.18 0.02595 0.05624
##
## Concordance= 0.695 (se = 0.012 )
## Likelihood ratio test= 304.4 on 1 df, p=<2e-16
## Wald test = 273.8 on 1 df, p=<2e-16
## Score (logrank) test = 463.4 on 1 df, p=<2e-16
# p-value < 0.01 --> reject H0. Difference between Mnone and Mwarm is highly significant -> should keep isWarm
cat("--- Mwind ---\n\n")
## --- Mwind ---
summary(Mwind)
## Call:
## coxph(formula = Surv(start, stop, event) ~ windSpeed, data = data)
##
## n= 641, number of events= 376
##
## coef exp(coef) se(coef) z Pr(>|z|)
## windSpeed -0.11485 0.89150 0.05268 -2.18 0.0293 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## windSpeed 0.8915 1.122 0.804 0.9885
##
## Concordance= 0.544 (se = 0.022 )
## Likelihood ratio test= 4.78 on 1 df, p=0.03
## Wald test = 4.75 on 1 df, p=0.03
## Score (logrank) test = 4.77 on 1 df, p=0.03
# p-value < 0.05 --> reject H0. Difference between Mnone and Mwind is significant -> should keep windSpeed
cat("--- Mcloud ---\n\n")
## --- Mcloud ---
summary(Mcloud)
## Call:
## coxph(formula = Surv(start, stop, event) ~ cloudHeight, data = data)
##
## n= 641, number of events= 376
##
## coef exp(coef) se(coef) z Pr(>|z|)
## cloudHeight 0.11946 1.12689 0.06012 1.987 0.0469 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## cloudHeight 1.127 0.8874 1.002 1.268
##
## Concordance= 0.53 (se = 0.023 )
## Likelihood ratio test= 3.98 on 1 df, p=0.05
## Wald test = 3.95 on 1 df, p=0.05
## Score (logrank) test = 3.95 on 1 df, p=0.05
# p-value < 0.05 --> reject H0. Difference between Mnone and Mcloud is significant -> should keep cloudHeight
# Regarding the correlation Matrix, Temperature, warmBefore are the most significant covariates
MtempWarm <- coxph(Surv(start, stop, event)~temperature + isWarm + strata(season) , data=data)
MtempWarmWindCloud <- coxph(Surv(start, stop, event)~temperature + isWarm + strata(season) + windSpeed + cloudHeight , data=data)
Mall <- coxph(Surv(start, stop, event)~latitude + longitude + isWarm + temperature + windSpeed + pressure + humidity + visibility + nebulosity + cloudHeight + moonPhase + strata(season), data=data)
anova(MtempWarm, Mall)
## Analysis of Deviance Table
## Cox model: response is Surv(start, stop, event)
## Model 1: ~ temperature + isWarm + strata(season)
## Model 2: ~ latitude + longitude + isWarm + temperature + windSpeed + pressure + humidity + visibility + nebulosity + cloudHeight + moonPhase + strata(season)
## loglik Chisq Df Pr(>|Chi|)
## 1 -1470.7
## 2 -1435.0 71.436 9 7.959e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p-value < 0.05 --> reject H0. Difference between the models is highly significant -> should keep more covariates than just Temperature and isWarm
anova(MtempWarmWindCloud, Mall)
## Analysis of Deviance Table
## Cox model: response is Surv(start, stop, event)
## Model 1: ~ temperature + isWarm + strata(season) + windSpeed + cloudHeight
## Model 2: ~ latitude + longitude + isWarm + temperature + windSpeed + pressure + humidity + visibility + nebulosity + cloudHeight + moonPhase + strata(season)
## loglik Chisq Df Pr(>|Chi|)
## 1 -1450.7
## 2 -1435.0 31.517 7 4.989e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p-value = 0.01257, < 0.05 --> reject H0. Difference between the models is significant -> more covariates are necessary to explain the data
# Model selection using all covariates:
MAIC <- step(Mall)
## Start: AIC=2891.96
## Surv(start, stop, event) ~ latitude + longitude + isWarm + temperature +
## windSpeed + pressure + humidity + visibility + nebulosity +
## cloudHeight + moonPhase + strata(season)
##
##
## Step: AIC=3270.68
## Surv(start, stop, event) ~ latitude + longitude + isWarm + temperature +
## windSpeed + pressure + humidity + visibility + nebulosity +
## cloudHeight + moonPhase
summary(MAIC)
## Call:
## coxph(formula = Surv(start, stop, event) ~ latitude + longitude +
## isWarm + temperature + windSpeed + pressure + humidity +
## visibility + nebulosity + cloudHeight + moonPhase, data = data)
##
## n= 641, number of events= 376
##
## coef exp(coef) se(coef) z Pr(>|z|)
## latitude 0.04566 1.04671 0.08351 0.547 0.584588
## longitude 0.16842 1.18343 0.06528 2.580 0.009880 **
## isWarm -2.07290 0.12582 0.23488 -8.825 < 2e-16 ***
## temperature -1.18656 0.30527 0.09937 -11.941 < 2e-16 ***
## windSpeed -0.19953 0.81912 0.05703 -3.499 0.000467 ***
## pressure -0.14994 0.86076 0.07606 -1.971 0.048687 *
## humidity -0.40031 0.67011 0.09945 -4.025 5.69e-05 ***
## visibility -0.22837 0.79583 0.06243 -3.658 0.000254 ***
## nebulosity -0.10825 0.89740 0.06749 -1.604 0.108716
## cloudHeight 0.22859 1.25683 0.08634 2.648 0.008106 **
## moonPhase 0.04129 1.04216 0.05335 0.774 0.438883
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## latitude 1.0467 0.9554 0.8887 1.2329
## longitude 1.1834 0.8450 1.0413 1.3450
## isWarm 0.1258 7.9479 0.0794 0.1994
## temperature 0.3053 3.2758 0.2512 0.3709
## windSpeed 0.8191 1.2208 0.7325 0.9160
## pressure 0.8608 1.1618 0.7416 0.9991
## humidity 0.6701 1.4923 0.5514 0.8143
## visibility 0.7958 1.2566 0.7042 0.8994
## nebulosity 0.8974 1.1143 0.7862 1.0243
## cloudHeight 1.2568 0.7957 1.0612 1.4886
## moonPhase 1.0422 0.9595 0.9387 1.1570
##
## Concordance= 0.829 (se = 0.012 )
## Likelihood ratio test= 466.1 on 11 df, p=<2e-16
## Wald test = 408.9 on 11 df, p=<2e-16
## Score (logrank) test = 619.1 on 11 df, p=<2e-16
# MoonPhase, nebulosity, pressure and latitude : not necessary at all
# 11 degrees of freedom
Mbest <- coxph(Surv(start, stop, event)~longitude + latitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight + strata(season), data=data)
summary(Mbest)
## Call:
## coxph(formula = Surv(start, stop, event) ~ longitude + latitude +
## pressure + temperature + isWarm + windSpeed + humidity +
## visibility + cloudHeight + strata(season), data = data)
##
## n= 641, number of events= 376
##
## coef exp(coef) se(coef) z Pr(>|z|)
## longitude 0.11459 1.12142 0.06839 1.676 0.093812 .
## latitude 0.20656 1.22944 0.08176 2.526 0.011522 *
## pressure -0.19490 0.82292 0.07478 -2.606 0.009148 **
## temperature -0.85406 0.42568 0.10255 -8.328 < 2e-16 ***
## isWarm -1.63839 0.19429 0.25785 -6.354 2.1e-10 ***
## windSpeed -0.20731 0.81277 0.05590 -3.709 0.000208 ***
## humidity -0.30283 0.73873 0.08680 -3.489 0.000485 ***
## visibility -0.06500 0.93706 0.06053 -1.074 0.282829
## cloudHeight 0.23976 1.27094 0.08047 2.980 0.002886 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## longitude 1.1214 0.8917 0.9807 1.2823
## latitude 1.2294 0.8134 1.0474 1.4431
## pressure 0.8229 1.2152 0.7107 0.9528
## temperature 0.4257 2.3492 0.3482 0.5205
## isWarm 0.1943 5.1469 0.1172 0.3221
## windSpeed 0.8128 1.2304 0.7284 0.9069
## humidity 0.7387 1.3537 0.6232 0.8757
## visibility 0.9371 1.0672 0.8322 1.0551
## cloudHeight 1.2709 0.7868 1.0855 1.4881
##
## Concordance= 0.755 (se = 0.018 )
## Likelihood ratio test= 198.6 on 9 df, p=<2e-16
## Wald test = 199 on 9 df, p=<2e-16
## Score (logrank) test = 196.1 on 9 df, p=<2e-16
tab1 <- tidy(Mbest, exponentiate=T, conf.int=T) # conf.int : confident interval
# p-values are < 0.05 so we accept these covariates, there are statistically significant
tab1
## # A tibble: 9 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 longitude 1.12 0.0684 1.68 9.38e- 2 0.981 1.28
## 2 latitude 1.23 0.0818 2.53 1.15e- 2 1.05 1.44
## 3 pressure 0.823 0.0748 -2.61 9.15e- 3 0.711 0.953
## 4 temperature 0.426 0.103 -8.33 8.23e-17 0.348 0.520
## 5 isWarm 0.194 0.258 -6.35 2.10e-10 0.117 0.322
## 6 windSpeed 0.813 0.0559 -3.71 2.08e- 4 0.728 0.907
## 7 humidity 0.739 0.0868 -3.49 4.85e- 4 0.623 0.876
## 8 visibility 0.937 0.0605 -1.07 2.83e- 1 0.832 1.06
## 9 cloudHeight 1.27 0.0805 2.98 2.89e- 3 1.09 1.49
Mbest %>% survfit2() %>% ggsurvfit() + add_confidence_interval() + add_risktable() + scale_y_continuous(limits = c(0, 1))
anova(Mall, Mbest)
## Analysis of Deviance Table
## Cox model: response is Surv(start, stop, event)
## Model 1: ~ latitude + longitude + isWarm + temperature + windSpeed + pressure + humidity + visibility + nebulosity + cloudHeight + moonPhase + strata(season)
## Model 2: ~ longitude + latitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight + strata(season)
## loglik Chisq Df Pr(>|Chi|)
## 1 -1435.0
## 2 -1437.9 5.8174 2 0.05455 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p>0.05 -> reject H1, both models are significantly equals -> keep only the covariates from Mbest is sufficient.
# Without strata(season)
Mall_noSeason <- coxph(Surv(start, stop, event)~longitude + latitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight + nebulosity + moonPhase, data=data)
Mbest_noSeason <- coxph(Surv(start, stop, event)~longitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight , data=data)
anova(Mall_noSeason, Mbest_noSeason)
## Analysis of Deviance Table
## Cox model: response is Surv(start, stop, event)
## Model 1: ~ longitude + latitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight + nebulosity + moonPhase
## Model 2: ~ longitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight
## loglik Chisq Df Pr(>|Chi|)
## 1 -1624.3
## 2 -1625.9 3.0518 3 0.3837
# p-value is even higher than when considering strata(season)... -> do not take strata(season)
Mbest_noSeason %>% survfit2() %>% ggsurvfit() + add_confidence_interval() + add_risktable() + scale_y_continuous(limits = c(0, 1))
ggforest(Mall_noSeason , main="Hazard ratios and 95% confidence intervals", data=data) # FIXME Not working with strata(season)
ggforest(Mbest_noSeason, main="Hazard ratios and 95% confidence intervals", data=data) # FIXME Not working with strata(season)
# Confidence intervals are relatively small -> good !
# line is for HR=1 ;
# HR of isWarm is only <0.1 -> less than 10% of the hazard
ggcoef_model(Mbest_noSeason, exponentiate=TRUE)
ggcoef_model(Mall_noSeason , exponentiate=TRUE)
# Try to identify the covariates with time-varying effects
# test : H0 = HR for the covariate is constant over time
cox.zph(Mbest)
## chisq df p
## longitude 18.55 1 1.7e-05
## latitude 24.64 1 6.9e-07
## pressure 9.55 1 0.00199
## temperature 13.98 1 0.00019
## isWarm 7.46 1 0.00629
## windSpeed 7.05 1 0.00793
## humidity 16.21 1 5.7e-05
## visibility 1.57 1 0.21018
## cloudHeight 5.49 1 0.01915
## GLOBAL 72.26 9 5.5e-12
par(mfrow=c(3,3), mai = c(0.6, 0.6, 0.1, 0.1))
plot(cox.zph(Mbest))
# p-values are all < 0.05 --> reject H0 -> time variations
cox.zph(Mbest_noSeason)
## chisq df p
## longitude 14.4625 1 0.00014
## pressure 7.3747 1 0.00661
## temperature 28.2379 1 1.1e-07
## isWarm 0.0815 1 0.77532
## windSpeed 4.2628 1 0.03895
## humidity 25.7813 1 3.8e-07
## visibility 1.0897 1 0.29653
## cloudHeight 11.0147 1 0.00090
## GLOBAL 75.4463 8 4.0e-13
plot(cox.zph(Mbest_noSeason))
# Looking at data, temperature seems to be linear dependent with time, try:
Mbest <- coxph(Surv(start, stop, event)~longitude + latitude + pressure + temperature + isWarm + windSpeed + humidity + visibility + cloudHeight + strata(season), data=data)
Mbest_tt1 <- coxph(Surv(start, stop, event)~longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season), data=data, tt=function(x,t,...) x/t)
Mbest_tt2 <- coxph(Surv(start, stop, event)~longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season) , data=data, tt=function(x,t,...) x*t)
Mbest_tt3 <- coxph(Surv(start, stop, event)~longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season), data=data, tt=function(x,t,...) x*log(t))
anova(Mbest_tt1, Mbest_tt2) # p-value < 0.01 -> highly significant difference between the two models
## Analysis of Deviance Table
## Cox model: response is Surv(start, stop, event)
## Model 1: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
## Model 2: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
## loglik Chisq Df Pr(>|Chi|)
## 1 -1423.9
## 2 -1407.8 32.269 0 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(Mbest_tt1, Mbest_tt3) # p-value < 0.01 -> highly significant difference between the two models
## Analysis of Deviance Table
## Cox model: response is Surv(start, stop, event)
## Model 1: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
## Model 2: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
## loglik Chisq Df Pr(>|Chi|)
## 1 -1423.9
## 2 -1411.6 24.493 0 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(Mbest_tt2, Mbest_tt3) # p-value < 0.01 -> highly significant difference between the two models
## Analysis of Deviance Table
## Cox model: response is Surv(start, stop, event)
## Model 1: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
## Model 2: ~ longitude + tt(longitude) + latitude + tt(latitude) + pressure + tt(pressure) + temperature + tt(temperature) + isWarm + tt(isWarm) + windSpeed + tt(windSpeed) + humidity + tt(humidity) + visibility + cloudHeight + strata(season)
## loglik Chisq Df Pr(>|Chi|)
## 1 -1407.8
## 2 -1411.6 7.7764 0 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Termplot figures for model tt1 show slower values -> take this one as favorite.
termplot(Mbest_tt1, se=TRUE)
termplot(Mbest_tt2, se=TRUE)
termplot(Mbest_tt3, se=TRUE)
residuals_best <- residuals(Mbest , type="martingale")
residuals_tt1 <- residuals(Mbest_tt1, type="martingale")
residuals_tt2 <- residuals(Mbest_tt2, type="martingale")
residuals_tt3 <- residuals(Mbest_tt3, type="martingale")
par(mfrow = c(1, 4), mar = c(4, 4, 3, 1))
plot(residuals_best, ylim=c(-6,1), main="BEST")
plot(residuals_tt1 , ylim=c(-6,1), main="TT1")
plot(residuals_tt2 , ylim=c(-6,1), main="TT2")
plot(residuals_tt3 , ylim=c(-6,1), main="TT3")
# Results are similare for differents time-dependent function...
par(mfrow = c(1,1), mar = c(3, 4, 3, 1))
# residuals for NONE is far from 0 (=1) while BEST and ALL is closer to 0.
# Many outliers -> try to identify them
boxplot(residuals_best, residuals_tt1, names=c("BEST", "TT1"))
boxplot(residuals_tt1, residuals_tt2, residuals_tt3, names=c("TT1", "TT2", "TT3"))
par(mfrow = c(2,3), mar = c(3, 4, 3, 1))
hist(residuals_best, xlim=c(-4, 1), breaks=20, main="Best model")
hist(residuals_tt1 , xlim=c(-4, 1), breaks=6, main="Time-varying best model")
boxplot(residuals_best, residuals_tt1, names=c("Best", "Time-varying"))
predict_best <- predict(Mbest , type="lp")
predict_tt <- predict(Mbest_tt1, type = "lp")
par(mfrow = c(1, 2), mar = c(4, 4, 3, 1))
plot(predict_best, residuals_best, ylim=c(-2,1), xlab="Linear predictor", ylab="Martingale residuals", main="Best model")
lines(lowess(predict_best, residuals_best), col="red", lwd=2)
abline(h=0, lty=2)
plot(predict_tt , residuals_tt1 , ylim=c(-2,1), xlab="Linear predictor", ylab="Martingale residuals", main="Time-varying best model")
lines(lowess(predict_tt, residuals_tt1), col="red", lwd=2)
abline(h=0, lty=2)
# Residuals close to +1 indicate subjects who experienced the event but the model predicted a very low hazard (possibly early events or unusual cases).
# This could happen if the model underfits or the time-dependent term hasn't fully captured hazard variation.
# - Residuals near +1 = events with underestimated risk by the model.
# - Residuals near 0 = well-predicted outcomes.
# - Residuals near -1 = censored observations with large predicted hazard (over-predicted risk).